library(ggplot2)
library(ggpubr)
library(CDM)
library(boot)
library(tidyverse)
library(dummy)
dummy 0.1.3
dummyNews()
library(stringi)
library(stringr)
# NOTE(review): rm(list = ls()) deletes every object in the global environment
# (attached packages stay loaded). Consider dropping it and running the script
# in a fresh R session instead.
rm(list = ls())
# Read the raw exam export. Later steps rely on its File and SubjectID columns
# and on per-item columns whose names contain "Question".
x_pre <- read_csv("../data/FirstYearProject/OUTPUT.csv")
Parsed with column specification:
cols(
.default = col_character(),
SubjectID = [32mcol_double()[39m,
`Auto Score 1` = [32mcol_double()[39m,
`Auto Score 2` = [32mcol_double()[39m,
`Auto Score 3` = [32mcol_double()[39m,
`Auto Score 4` = [32mcol_double()[39m,
`Auto Score 5` = [32mcol_double()[39m,
`Auto Score 6` = [32mcol_double()[39m,
`Auto Score 7` = [32mcol_double()[39m,
`Auto Score 8` = [32mcol_double()[39m,
`Auto Score 9` = [32mcol_double()[39m,
`Auto Score 10` = [32mcol_double()[39m,
`Auto Score 11` = [32mcol_double()[39m,
`Auto Score 12` = [32mcol_double()[39m,
`Auto Score 13` = [32mcol_double()[39m,
`Auto Score 14` = [32mcol_double()[39m,
`Auto Score 15` = [32mcol_double()[39m,
`Auto Score 16` = [32mcol_double()[39m,
`Auto Score 17` = [32mcol_double()[39m,
`Auto Score 18` = [32mcol_double()[39m,
`Auto Score 19` = [32mcol_double()[39m
# ... with 34 more columns
)
See spec(...) for full column specifications.
# Read the question bank (question text, options, answer, objectives) and
# overwrite `Learning Objective` with the `Topic` column.
# NOTE(review): a later step recomputes `Learning Objective` from
# `APA Learning Objective`, so this overwrite appears to have no lasting effect.
Q_from_book <- read_csv("../data/FirstYearProject/final_result_similar.csv") %>% mutate(`Learning Objective` = `Topic`)
Parsed with column specification:
cols(
Question = [31mcol_character()[39m,
Option1 = [31mcol_character()[39m,
Option2 = [31mcol_character()[39m,
Option3 = [31mcol_character()[39m,
Option4 = [31mcol_character()[39m,
Answer = [31mcol_character()[39m,
`Learning Objective` = [31mcol_character()[39m,
Topic = [31mcol_character()[39m,
`Difficulty Level` = [31mcol_character()[39m,
`Skill Level` = [31mcol_character()[39m,
`APA Learning Objective` = [31mcol_character()[39m
)
# Derive `Learning Objective` from `APA Learning Objective`: strip every
# period, trim surrounding whitespace, then discard rows whose objective is the
# literal string "nan" (rows where the comparison yields NA are dropped too).
Q_from_book <- Q_from_book %>%
  mutate(
    `Learning Objective` = `APA Learning Objective` %>%
      str_remove_all("\\.") %>%
      str_trim()
  ) %>%
  filter(`Learning Objective` != "nan")
glimpse(Q_from_book)
Observations: 992
Variables: 11
$ Question [3m[38;5;246m<chr>[39m[23m "Which of the following is an example of social ...
$ Option1 [3m[38;5;246m<chr>[39m[23m "a. You feel guilty because you lied to your tru...
$ Option2 [3m[38;5;246m<chr>[39m[23m "b. When you get hungry, you have trouble concen...
$ Option3 [3m[38;5;246m<chr>[39m[23m "c. You didn\u0092t do well on the test because ...
$ Option4 [3m[38;5;246m<chr>[39m[23m "d. You almost fall asleep at the wheel, so you ...
$ Answer [3m[38;5;246m<chr>[39m[23m "A", "A", "D", "C", "A", "C", "C", "B", "D", "D"...
$ `Learning Objective` [3m[38;5;246m<chr>[39m[23m "11 Describe key concepts, principles, and overa...
$ Topic [3m[38;5;246m<chr>[39m[23m "Defining Social Psychology", "Defining Social P...
$ `Difficulty Level` [3m[38;5;246m<chr>[39m[23m "Moderate", "Moderate", "Moderate", "Moderate", ...
$ `Skill Level` [3m[38;5;246m<chr>[39m[23m "Understand the Concepts", "Understand the Conce...
$ `APA Learning Objective` [3m[38;5;246m<chr>[39m[23m "1.1 Describe key concepts, principles, and over...
# List the distinct skill levels present in the question bank.
distinct(Q_from_book, `Skill Level`)
NA
# Assign a sequential integer id (lo_id) to every distinct learning objective.
learning_obj <- Q_from_book %>%
  distinct(`Learning Objective`) %>%
  mutate(lo_id = row_number())
# Q_pre is not built here: the next chunk constructs it from scratch (and adds
# Q_UNIQUE_ID) before anything reads it, so an extra copy here would be dead
# code.
Joining, by = "Learning Objective"
# Show the learning-objective lookup table.
learning_obj
# Question lookup table: one row per question-bank entry with its learning
# objective id, a normalised copy of the question text (`temp`: underscores and
# periods removed, whitespace trimmed) used for text matching, and a sequential
# Q_UNIQUE_ID.
Q_pre <- Q_from_book %>%
  # Explicit key: avoids the "Joining, by" message and keeps the join stable
  # if the two tables ever share another column name.
  inner_join(learning_obj, by = "Learning Objective") %>%
  select(Question, `Learning Objective`, lo_id) %>%
  mutate(temp = str_trim(str_replace_all(Question, "_|\\.", ""))) %>%
  mutate(Q_UNIQUE_ID = row_number())
Joining, by = "Learning Objective"
# Display the question lookup table for inspection.
Q_pre
NA
# Peek at the first rows of the raw exam export.
head(x_pre)
NA
# Reshape the wide export to long form: every column except File and SubjectID
# becomes a (key, value) pair, with the original column name in `key` and the
# cell content in `value`.
x.gather <- gather(x_pre, key = "key", value = "value", -File, -SubjectID)
x.gather
# Keep only the rows whose key contains "Question", then remove ambiguous
# question texts.
# The inner pipeline finds every text (`value`) that occurs more than once for
# the same File and SubjectID. Because the anti_join is keyed on `value` alone,
# such a text is removed for ALL files and subjects, not only where the
# duplicate occurred.
# NOTE(review): missing values presumably also form groups with cnt > 1 and are
# then removed here as well, since dplyr joins match NA to NA by default -
# confirm.
x.questions <-
x.gather %>% filter(str_detect(key, "Question")) %>%
anti_join(
x.gather %>% filter(str_detect(key, "Question")) %>%
group_by(File, SubjectID, value) %>%
summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>%
filter(cnt > 1) %>% ungroup(),
by = "value"
) # Taking out generic questions (having same question text but different answers)
# Map each distinct presented question text to the question bank.
# The text is normalised the same way as Q_pre$temp (underscores and periods
# removed, whitespace trimmed) and joined on that normalised string; texts with
# no match in Q_pre are dropped by the inner join.
x.questions.dist <- x.questions %>% distinct(value) %>% drop_na() %>%
#mutate(Q_UNIQUE_ID = row_number()) %>%
mutate(temp = str_trim(str_replace_all(value, "_|\\.", ""))) %>%
inner_join(
Q_pre, by = "temp"
)
# Save the matched questions, then display them.
x.questions.dist %>% write_csv("../data/FirstYearProject/Q_distinct_id.csv")
x.questions.dist
# Build the Q-matrix: one row per Q_UNIQUE_ID and one column per lo_id, holding
# 1 where the question is linked to that learning objective and NA elsewhere.
Q <- x.questions.dist %>% distinct(Q_UNIQUE_ID, lo_id) %>% arrange(Q_UNIQUE_ID) %>%
mutate(present = 1) %>%
spread(key = "lo_id", value = "present")
# Write a copy with NA replaced by 0. Only the file gets the zero-filled
# version; the in-memory Q keeps its NA entries.
Q %>%
mutate_all(function(x) ifelse(is.na(x), 0, x)) %>%
write_csv("../data/FirstYearProject/Q.csv")
Q
NA
# Every long-format row whose key does not contain "Question" (score columns
# and any other non-question fields).
x.answers <-
  x.gather %>% filter(!str_detect(key, "Question"))
x.answers
# Total questions presented to students: 53 questions are randomly presented.
x.questions %>% distinct(key)
# Attach the question-bank match (Q_UNIQUE_ID, lo_id, ...) to every presented
# question. The key is stated explicitly; `value` is the only column the two
# tables share, so the result is the same as the implicit natural join.
x.questions.id <- x.questions %>%
  inner_join(x.questions.dist, by = "value")
#%>% mutate(Q_UNIQUE_ID = factor(Q_UNIQUE_ID))
Joining, by = "value"
# Display the presented questions together with their question-bank ids.
x.questions.id
Questions with same text but different Answers
# Second de-duplication pass, this time on the question-bank text (`Question`):
# any bank question that appears more than once for the same File and SubjectID
# is removed everywhere (the anti_join is keyed on `Question` only). The lo_id
# and `Learning Objective` columns are dropped from the result.
# NOTE(review): in the code visible after this point, X.pre is built from
# x.questions.id, not from this filtered table - confirm whether that is
# intended.
x.questions.id.filterd <- x.questions.id %>%
anti_join(
x.questions.id %>%
group_by(File, SubjectID, Question) %>%
summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>%
filter(cnt > 1) %>% ungroup(),
by = "Question"
) %>% select(-lo_id, -`Learning Objective`)
x.questions.id.filterd
NA
We now have the correct questions. Next, we attach the answer scores to the corresponding questions.
# Pair every presented question with its answer row.
# The item number `id` is the 2nd space-separated token of a question key and
# the 3rd token of an answer key (e.g. "Auto Score 1" -> "1"); rows are joined
# on File, SubjectID and id. After the join, columns with the .x suffix come
# from the question rows and those with .y from the answer rows.
# NOTE(review): question keys presumably look like "Question <n>"; answer keys
# with fewer than three tokens get an empty id (simplify = TRUE pads with "")
# and so should not match any question - confirm against the raw column names.
X.pre <- x.questions.id %>% mutate(id = str_split(key, " ", simplify = TRUE)[,2]) %>%
inner_join(
x.answers %>% mutate(id = str_split(key, " ", simplify = TRUE)[,3]), by = c("File", "SubjectID", "id")
) %>%
# The gathered values are text; convert the answer value to an integer score.
mutate(value.y = as.integer(value.y)) #%>%
#mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID))
#write_csv(X.pre, "X_Pre.csv")
X.pre
# List the question ids that ended up with at least one answer row.
unique(X.pre$Q_UNIQUE_ID)
[1] 19 182 246 299 118 91 114 88 82 9 259 105 78 13 260 49 17 148 273 288
[21] 40 166 66 202 225 177 126 204 140 293 116 136 186 223 149 229 122 281 137 64
[41] 291 67 33 30 271 504 966 480 972 477 981 493 496 460 522 465 403 376 441 354
[61] 529 514 474 430 439 446 348 416 323 356 473 501 986 515 371 331 497 380 505 368
[81] 343 520 345 467 369 980 483 412 418 604 955 600 590 562 612 549 605 591 610 557
[ reached getOption("max.print") -- omitted 839 entries ]
# Build the response matrix X: drop all helper columns so that only File,
# SubjectID, Q_UNIQUE_ID and the score remain, then spread to wide form with
# one column per Q_UNIQUE_ID (NA where that question has no row for the given
# File/SubjectID).
X<- X.pre %>% select(-key.x, -key.y, -value.x, -id, -temp, -lo_id, -`Learning Objective`, -Question ) %>%
spread(key = "Q_UNIQUE_ID", value = "value.y")
write_csv(X, "../data/FirstYearProject/X.csv")
X
Let’s run some tests to verify X.
# Sanity check: sum of the scores in each question column, ignoring NA.
X %>% select(-File, -SubjectID) %>% summarise_all(sum, na.rm = TRUE)
NA
# Long view of X: one row per File, SubjectID and question column.
X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID)
NA
# janitor provides remove_empty(), used from here on.
library(janitor)
# Rows of one file with the columns that are entirely NA for that file removed.
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols")
NA
# For every file and question, count the rows (total), the missing scores
# (total_na) and the non-missing scores (total_attempted), and keep only the
# file/question pairs with at least 8 non-missing scores.
question_attempted <- X %>%
  remove_empty(which = "cols") %>%
  gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>%
  group_by(File, QuestionID) %>%
  summarise(
    total_na = sum(is.na(Scores)),
    total = n(),
    total_attempted = total - total_na
  ) %>%
  filter(total_attempted >= 8)
question_attempted
# Debug aid: append `%>% filter(QuestionID == "103")` to inspect one question.
Filter out questions with too few attempts (fewer than 8 attempts within a file).
# Keep only the scores of File/question pairs listed in question_attempted:
# go long, semi_join on File and QuestionID, then spread back to wide form
# (pairs that were filtered out become NA in the wide table).
X_filtered <- X %>% remove_empty(.,which = "cols") %>%
gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>% semi_join(question_attempted, by = c("File", "QuestionID")) %>%
spread(key = "QuestionID", value = "Scores")
X_filtered
# Write both matrices without their all-NA columns. This replaces the X.csv
# written earlier when X was first built.
X %>% remove_empty(.,which = "cols") %>% write_csv("../data/FirstYearProject/X.csv")
X_filtered %>% remove_empty(.,which = "cols") %>% write_csv("../data/FirstYearProject/X_filtered.csv")
Write a separate CSV for each trial so that each file contains only the questions that were actually asked in that trial. This gives a true picture of the sparsity.
# Drop the empty (all-NA) columns of a data frame.
#
# df: a data frame.
# Returns `df` with its empty columns removed by
# remove_empty(which = "cols").
fn.clean <- function(df) {
  remove_empty(df, which = "cols")
}
# One row per File with list-columns:
#   X_full - that file's rows of X (all question columns),
#   X      - the same rows with all-NA columns removed (fn.clean),
#   Q_full - a copy of the complete Q table, repeated for every file.
X.individual.list <- X %>%
nest(-File, .key = "X_full") %>%
mutate(X = map(X_full, fn.clean),
Q_full = map(X_full, function(df) return (Q)))
X.individual.list
[38;5;246m# A tibble: 8 x 4[39m
File X_full X Q_full
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m
[38;5;250m1[39m Exam1Trial1 [38;5;246m<tibble [74 x 940]>[39m [38;5;246m<tibble [74 x 286]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m2[39m Exam1Trial2 [38;5;246m<tibble [57 x 940]>[39m [38;5;246m<tibble [57 x 277]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m3[39m Exam2Trial1 [38;5;246m<tibble [66 x 940]>[39m [38;5;246m<tibble [66 x 236]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m4[39m Exam2Trial2 [38;5;246m<tibble [67 x 940]>[39m [38;5;246m<tibble [67 x 237]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m5[39m Exam3Trial1 [38;5;246m<tibble [47 x 940]>[39m [38;5;246m<tibble [47 x 178]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m6[39m Exam3Trial2 [38;5;246m<tibble [78 x 940]>[39m [38;5;246m<tibble [78 x 179]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m7[39m Exam4Trial1 [38;5;246m<tibble [64 x 940]>[39m [38;5;246m<tibble [64 x 239]>[39m [38;5;246m<tibble [939 x 5]>[39m
[38;5;250m8[39m Exam4Trial2 [38;5;246m<tibble [72 x 940]>[39m [38;5;246m<tibble [72 x 239]>[39m [38;5;246m<tibble [939 x 5]>[39m
# Spot check: Exam1Trial1 rows with that file's all-NA columns removed.
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols")
NA
# Display the full Q-matrix (NA where a question is not linked to an objective).
Q
NA
# Build the Q-matrix rows for the questions contained in one score table.
#
# df: wide score table with a SubjectID column and one column per question,
#     the column names being Q_UNIQUE_ID values.
# Returns: one row per question id of `df` that is also present in the global
#   table `Q`, carrying Q's columns; columns that are entirely NA for these
#   questions are removed and the remaining NA entries are replaced by 0.
fn.skills <- function(df) {
  df %>%
    remove_empty(which = "cols") %>%
    gather(key = "Q_UNIQUE_ID", value = "Score", -SubjectID) %>%
    # Column names are text after gather(); Q stores the ids as integers.
    mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>%
    distinct(Q_UNIQUE_ID) %>%
    # Explicit key: no "Joining, by" message on every call, and the join
    # cannot silently pick up another shared column name.
    inner_join(Q, by = "Q_UNIQUE_ID") %>%
    remove_empty(which = "cols") %>%
    mutate_all(function(x) ifelse(is.na(x), 0, x))
}
# Add a list-column Q holding, for each file, the Q-matrix restricted to the
# questions in that file's cleaned score table (fn.skills applied to X).
X.Q <- X.individual.list %>%
mutate(Q = map(X, fn.skills))
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
# Display the per-file table of score matrices and Q-matrices.
X.Q
[38;5;246m# A tibble: 8 x 5[39m
File X_full X Q_full Q
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m
[38;5;250m1[39m Exam1Trial1 [38;5;246m<tibble [74 x 940[0m~ [38;5;246m<tibble [74 x 286[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [285 x [0m~
[38;5;250m2[39m Exam1Trial2 [38;5;246m<tibble [57 x 940[0m~ [38;5;246m<tibble [57 x 277[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [276 x [0m~
[38;5;250m3[39m Exam2Trial1 [38;5;246m<tibble [66 x 940[0m~ [38;5;246m<tibble [66 x 236[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [235 x [0m~
[38;5;250m4[39m Exam2Trial2 [38;5;246m<tibble [67 x 940[0m~ [38;5;246m<tibble [67 x 237[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [236 x [0m~
[38;5;250m5[39m Exam3Trial1 [38;5;246m<tibble [47 x 940[0m~ [38;5;246m<tibble [47 x 178[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [177 x [0m~
[38;5;250m6[39m Exam3Trial2 [38;5;246m<tibble [78 x 940[0m~ [38;5;246m<tibble [78 x 179[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [178 x [0m~
[38;5;250m7[39m Exam4Trial1 [38;5;246m<tibble [64 x 940[0m~ [38;5;246m<tibble [64 x 239[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [238 x [0m~
[38;5;250m8[39m Exam4Trial2 [38;5;246m<tibble [72 x 940[0m~ [38;5;246m<tibble [72 x 239[0m~ [38;5;246m<tibble [939 x 5[0m~ [38;5;246m<tibble [238 x [0m~
# Manual check for one file: repeat the fn.skills steps on Exam2Trial2 and sum
# every resulting column (ids and 0/1 indicators), giving one summary row.
X %>% filter(File == "Exam2Trial2") %>% remove_empty(.,which = "cols") %>%
gather(key = "Q_UNIQUE_ID", value = "Score", -File, -SubjectID) %>%
mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>% distinct(Q_UNIQUE_ID) %>%
inner_join(
Q, by = "Q_UNIQUE_ID"
) %>% remove_empty(.,which = "cols") %>% mutate_all(function(x) ifelse(is.na(x), 0, x)) %>% summarise_all(sum)
[38;5;246m# A tibble: 1 x 2[39m
Q_UNIQUE_ID `1`
[3m[38;5;246m<int>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m [4m1[24m[4m1[24m[4m3[24m706 236
# Display the Exam1Trial1 rows of X with every question column kept.
X %>% filter(File == "Exam1Trial1")
NA
# Print one trial's score table and save it, together with its Q-matrix, as CSV.
#
# File:           label used as the file-name prefix.
# X, Q:           tables written to "<File>_X.csv" and "<File>_Q.csv" under
#                 ../data/FirstYearProject/.
# X_full, Q_full: not used in the body.
fn.write <- function(File, X_full, X, Q_full, Q) {
  print(X)
  x_path <- paste0("../data/FirstYearProject/", File, "_X.csv")
  q_path <- paste0("../data/FirstYearProject/", File, "_Q.csv")
  write_csv(X, x_path)
  write_csv(Q, q_path)
}
# Earlier variant, kept for reference:
#walk2(X.Q$File, X.Q$data_clean, X.Q$data_Q_skills, fn.write)
# Call fn.write once per row of X.Q; pwalk() passes the columns of X.Q to the
# function as named arguments.
pwalk(X.Q, fn.write)
NA
NA